/* Build *UPDATED* auxiliary data sets
	* PD ratio
		do "do/auxiliary_files/data_construction_PD_ratio.do"
		do "do/auxiliary_files/data_construction_PD_ratio_updated.do"
	* Sentiment
		do "do/auxiliary_files/data_construction_sentiment.do"
		do "do/auxiliary_files/data_construction_sentiment_updated.do"
	* FD
		do "do/auxiliary_files/data_construction_FD.do"
*/	


* THIS VERSION: includes ALL countries from the OECD database, independently of whether PD is available (or odd, in the case of Japan)
* NOTE: the OECD database also includes some non-OECD countries, such as Brazil, China, and Russia, among others


***************************
* Combining all data sets *
***************************

* Import R&D data from Excel to Stata
	clear all
	import excel "orig/rd.xlsx", sheet("final_series") firstrow
	destring cou, replace
	egen id = group(cou)
	xtset id year
	gen g_rd = ln(rd) - ln(l.rd)
	drop id
	save "data/rd.dta", replace
	
* Merge Penn data with sentiment and US macro data
	clear all
	use "orig/penn_world_table/penn_world_table_data.dta"
	rename countrycode cou
	// merge
		// PD ratio
			merge m:m cou year using "data/pd.dta"
			drop _merge
		// local sentiment data
			merge m:m cou year using "data/local_sentiments_panel_y.dta"
			drop _merge
		// local sentiment data measured in December
			merge m:m cou year using "data/local_sentiments_panel_y_m12.dta"
			drop _merge
		// aggregate sentiment data
			merge m:m year using "data/main_sentiments_y.dta"
			drop _merge
		// US eight principal components from Ludvingson and Ng (2009)
			merge m:m year using "data/macro_forecasting_factors_updated_annual.dta"
			drop _merge
		// rd data
			merge m:m cou year using "data/rd.dta"
			drop _merge
	// set data as panel
		egen id = group(cou)
		xtset id year

* Generate macro variables
	// TFP
		// calculate t-step-ahead TFP growth
			foreach x in rtfpna rwtfpna ctfp cwtfp g_rd {
				gen f_`x' = f.`x'
				gen f2_`x' = f2.`x'
				gen f3_`x' = f3.`x'
				gen f4_`x' = f4.`x'
				gen f5_`x' = f5.`x'
				gen f6_`x' = f6.`x'
				gen f7_`x' = f7.`x'
				gen f8_`x' = f8.`x'				
			}
			*** NOTE: the definition of TFP growth (or "RTFP") is in Eq.(5) of "pwt80_capital_labor_and_TFP")
		// summary statistics
			tabstat rtfpna rwtfpna ctfp cwtfp g_rd, statistics(mean sd min med max)
			pwcorr rtfpna rwtfpna ctfp cwtfp g_rd, sig
	// macro variables
		// generate consumption and capital
			// real household consumption
				gen rhc = csh_c * rgdpna
			// real gross capital formation
				gen rgcf = csh_i * rgdpna
		// calculate growth rates
			foreach x in rgdpna emp rhc labsh pl_con rconna rgcf rnna rdana irr {
				gen d_`x' = d.`x'
				gen g_`x' = ln(`x') - ln(l.`x')
			}
		// calculate future values
			foreach x in g_rgcf irr d_irr g_rhc g_emp g_rgdpna {
				gen f_`x' = f.`x'
				gen f2_`x' = f2.`x'
				gen f3_`x' = f3.`x'
				gen f4_`x' = f4.`x'
				gen f5_`x' = f5.`x'
				gen f6_`x' = f6.`x'
				gen f7_`x' = f7.`x'
				gen f8_`x' = f8.`x'				
			}
	// GDP shares
		// net exports 
			gen csh_nx = csh_x - csh_m
		// changes
			foreach x in csh_c csh_i csh_nx csh_x csh_m {
			gen d_`x' = d.`x'
			gen f_d_`x' = fd.`x'			
			}		
	// rescale and express in changes
		tabstat rgdpna rconna emp, statistics(mean sd med)
		gen rgdpna_1tr = rgdpna / 1000000
		gen rconna_1tr = rconna / 1000000
		tabstat rgdpna_1tr rconna_1tr, statistics(mean sd min med max)
		foreach x in rgdpna_1tr rconna_1tr {
			gen d_`x' = d.`x'
		}
				
* Dummy for G7 countries (USA, CAN, FRA, DEU, ITA, JPN, GBR)
	// including the U.S.
		gen dum_G7 = (cou == "USA" | cou == "CAN" | cou == "FRA" | cou == "DEU" | cou == "ITA" | cou == "JPN" | cou == "GBR")
	// excluding the U.S.
		gen dum_G7n = (cou == "CAN" | cou == "FRA" | cou == "DEU" | cou == "ITA" | cou == "JPN" | cou == "GBR")
	
* Consumer sentiment
	// country-level
		// orthogonalized
			// changes
				qui: xtreg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year, robust fe
				predict cci_ort, r
			// changes, measured in December
				qui: xtreg cci_m12 d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year, robust fe
				predict cci_ort_m12, r
			// changes, excluding the U.S.
				qui: xtreg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con i.year if cou != "USA", robust fe
				predict cci_ort1, r
		// explained
			gen cci_exp = cci - cci_ort
			gen cci_exp_m12 = cci_m12 - cci_ort_m12
			gen cci_exp1 = cci - cci_ort1 if cou != "USA"
		// standardized
			foreach x in cci cci_ort cci_exp cci_m12 cci_ort_m12 cci_exp_m12 cci_ort1 cci_exp1 {
			egen `x'mean = mean(`x')
			egen `x'sd = sd(`x')
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
			}
		// restrict sample to consumer sentiment subsample
			keep if cci_ort != .
			save "data/final_data_set_extended.dta", replace
	// specific geographic areas
		// U.S.
			clear all
			use "data/final_data_set_extended.dta"
			keep if cou == "USA"
			tsset year
			reg cci d_rgdpna d_emp d_rconna d_labsh d_pl_con, robust
			predict cci_USA_ort, r
			keep cci_USA_ort year
			save "data/cci_USA_ort.dta", replace
			clear all
			use "data/final_data_set_extended.dta"
			merge m:m year using "data/cci_USA_ort.dta"
			drop _merge
			save "data/final_data_set_extended.dta", replace
			erase "data/cci_USA_ort.dta"
		// G7 countries
			clear all
			use "data/final_data_set_extended.dta"
			keep if dum_G7 == 1
			egen cci_G7_ort = rowmean(cci_ort)
			collapse cci_G7_ort, by(year)
			save "data/cci_G7_ort.dta", replace
			clear all
			use "data/final_data_set_extended.dta"
			merge m:m year using "data/cci_G7_ort.dta"
			drop _merge
			save "data/final_data_set_extended.dta", replace
			erase "data/cci_G7_ort.dta"
		// G7 countries except the U.S.
			clear all
			use "data/final_data_set_extended.dta"
			keep if dum_G7n == 1
			egen cci_G7n_ort = rowmean(cci_ort)
			collapse cci_G7n_ort, by(year)
			save "data/cci_G7n_ort.dta", replace
			clear all
			use "data/final_data_set_extended.dta"
			merge m:m year using "data/cci_G7n_ort.dta"
			drop _merge
			save "data/final_data_set_extended.dta", replace
			erase "data/cci_G7n_ort.dta"
		// All OECD countries
			clear all
			use "data/final_data_set_extended.dta"
			egen cci_OECD_ort = rowmean(cci_ort)
			collapse cci_OECD_ort, by(year)
			save "data/cci_OECD_ort.dta", replace
			clear all
			use "data/final_data_set_extended.dta"
			merge m:m year using "data/cci_OECD_ort.dta"
			drop _merge
			save "data/final_data_set_extended.dta", replace
			erase "data/cci_OECD_ort.dta"
		
	save "data/final_data_set_extended.dta", replace

* Standardize macro variables
	// local
		foreach x in d_rgdpna_1tr d_emp d_rconna_1tr d_labsh d_pl_con {
			egen `x'mean = mean(`x')
			egen `x'sd = sd(`x')
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
		}
	// US
		foreach x in F1 F2 F3 F4 F5 F6 F7 F8 {
			egen `x'mean = mean(`x')
			egen `x'sd = sd(`x')
			gen z`x' = (`x'-`x'mean) / `x'sd
			drop `x'sd `x'mean
		}
	
* Financial development
	// fixed and predetermined
		// merge with main data set
			merge m:m country using "data/FD.dta"
			drop _merge
		// correlation between G7 countries and FD
			pwcorr dum_G7 fd1 fd2 fd3 fd4 fd0, sig obs
		// standardize
			foreach x in fd1 fd2 fd3 fd4 fd5 fd6 fd0 {
				egen `x'mean = mean(`x')
				egen `x'sd = sd(`x')
				gen z`x' = (`x'-`x'mean) / `x'sd
				drop `x'sd `x'mean
			}	
		// create interaction terms
			foreach x in zcci_ort zcci_exp {
				foreach y in fd1 fd2 fd3 fd4 fd5 fd6 fd0 {
					gen `x'_z`y' = `x' * z`y'
				}
			}
		// FD measures with the most observations
			tabulate country if fd0raw != .
			tabulate country if fd2raw != . /* data for Ireland only becomes available in 1995 */

	// time-varying
		// merge with main data set
			merge m:m country year using "data/vFD.dta"
			drop _merge
			duplicates list id year
			duplicates drop id year, force
		// correlation between G7 countries and FD
			pwcorr dum_G7 vfd1 vfd2 vfd3 vfd4 vfd0, sig obs
		// standardize
			foreach x in vfd1 vfd2 vfd3 vfd4 vfd5 vfd6 vfd0 {
				egen `x'mean = mean(`x')
				egen `x'sd = sd(`x')
				gen z`x' = (`x'-`x'mean) / `x'sd
				drop `x'sd `x'mean
			}	
		// create interaction terms
			foreach x in zcci_ort zcci_exp {
				foreach y in vfd1 vfd2 vfd3 vfd4 vfd5 vfd6 vfd0 {
					gen `x'_z`y' = `x' * z`y'
				}
			}
		// FD measures with the most observations
			tabulate country if vfd0raw != .
			tabulate country if vfd2raw != . /* data for Ireland only becomes available in 1995 */
			
	// avoid clutter and delete missing observations
		drop if year == .
	
	save "data/final_data_set_extended.dta", replace
	clear all
